In [1]:
from PIL import Image, ImageStat, ImageCms
import glob
import os.path, os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
imgdir = 'images2' # local folder that holds the downloaded artwork images
In [3]:
# Load the auction-lot table and discard the stray 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- confirm against the CSV).
df_1 = (
    pd.read_csv('df_nonLinear.csv')
    .drop(columns=['Unnamed: 0'])
)
In [4]:
# Keep only the lots that come with an image URL, then summarise the columns.
df_2 = df_1.dropna(subset=['Artwork_Image'])
df_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7947 entries, 4 to 9375
Data columns (total 35 columns):
Artist_Name              7947 non-null object
Auction_Year             7947 non-null int64
Lot_ID                   7947 non-null int64
Artwork_Image            7947 non-null object
Lot_No                   7946 non-null float64
Status                   7947 non-null object
Lot_Estimate_Low_USD     7639 non-null float64
Lot_Estimate_High_USD    7660 non-null float64
SalePrice_USD            7947 non-null int64
Auction_Title            7947 non-null object
Auction_Date             7947 non-null object
Price_Type               7947 non-null object
Artwork_Width            7947 non-null float64
Artwork_Height           7947 non-null float64
Artwork_Year_Start       5931 non-null float64
Painting_Title           7947 non-null object
Provenance               7947 non-null object
Literature               7947 non-null object
Aspect_Ratio             7947 non-null float64
Orientation              7947 non-null object
Gender                   7947 non-null object
AliveAtAuction           7947 non-null object
Age_Work_Start           5926 non-null float64
Artwork_Area             7947 non-null float64
Auction.House            7947 non-null object
Auction.Location         7947 non-null object
Auction.Currency         7947 non-null object
Medium                   7947 non-null object
NameOfArtist             7947 non-null object
YearOfAuction            7947 non-null int64
CPI                      7947 non-null float64
AdjustedSalePrice        7947 non-null float64
Price                    7947 non-null object
Old_Index                7947 non-null float64
Index                    7947 non-null float64
dtypes: float64(13), int64(4), object(18)
memory usage: 2.2+ MB
In [5]:
# Map every image URL to its downloaded copy under imgdir by swapping the URL prefix shared
# by all rows (http://artinfo-images-350.s3.amazonaws.com/ for this data set) for the local folder.
url_prefix = os.path.commonprefix(df_2.Artwork_Image.tolist())
# commonprefix compares character by character, so it can stop in the middle of a path
# segment; cut it back to the last '/' so that only whole segments are stripped.
url_prefix = url_prefix[:url_prefix.rfind('/') + 1]
# count=1 touches only the leading occurrence, and keeps an empty prefix from being
# "replaced" between every character of the URL.
df_3 = df_2.assign(Artwork_Image_Path=df_2.Artwork_Image.apply(lambda x: x.replace(url_prefix, imgdir + "/", 1)))
df_3.shape
Out[5]:
(7947, 36)
In [6]:
# Preview the first rows, including the derived Artwork_Image_Path column.
df_3.head()
Out[6]:
Artist_Name Auction_Year Lot_ID Artwork_Image Lot_No Status Lot_Estimate_Low_USD Lot_Estimate_High_USD SalePrice_USD Auction_Title ... Auction.Currency Medium NameOfArtist YearOfAuction CPI AdjustedSalePrice Price Old_Index Index Artwork_Image_Path
4 A A Raiba 2016 6449966 http://artinfo-images-350.s3.amazonaws.com/asi... 17.0 Sold 10500.0 13500.0 25200 Evening Sale ... INR OIL ON CANVAS A A Raiba 2016 241.432 26103.593558 Low 0.00454 0.000392 images2/asi2-125006/17.jpg
5 A A Raiba 2017 6645121 http://artinfo-images-350.s3.amazonaws.com/asi... 26.0 Sold 22350.0 29800.0 23244 Evening Sale ... INR OTHER-OIL A A Raiba 2017 246.524 23580.133034 Low 0.00454 0.000392 images2/asi2-129104/26.jpg
8 A A Raiba 2008 3627464 http://artinfo-images-350.s3.amazonaws.com/mis... 63.0 Sold 5000.0 7000.0 6250 Modern & Contemporary Indian Art ... USD OIL ON CANVAS A A Raiba 2008 210.228 7435.052657 Low 0.00454 0.000392 images2/missingImages/0887180/63.jpg
10 A A Raiba 2017 6645120 http://artinfo-images-350.s3.amazonaws.com/asi... 25.0 Sold 7450.0 10430.0 13410 Evening Sale ... INR OTHER-OIL A A Raiba 2017 246.524 13603.922904 Low 0.00454 0.000392 images2/asi2-129104/25.jpg
12 A A Raiba 2015 5996406 http://artinfo-images-350.s3.amazonaws.com/asi... 58.0 Sold 3052.0 4577.0 11443 The Fine Art Sale - Including Works from the C... ... INR OIL ON CANVAS A A Raiba 2015 236.525 12099.221761 Low 0.00454 0.000392 images2/asi2-120614/58.jpg

5 rows × 36 columns

In [7]:
# Keep only the rows whose Artwork_Image_Path is actually present on disk.
df_4 = df_3.loc[df_3['Artwork_Image_Path'].map(os.path.exists)]
df_4.shape
Out[7]:
(7897, 36)
In [8]:
def rgb_avg(x):
    """Return the mean red, green and blue value of the image file at path ``x``.

    The image is converted to RGB before it is measured, so greyscale,
    palette, CMYK or RGBA files still yield exactly three channel means.
    The result is a real list rather than a lazy ``map`` object, so it can be
    stored in a pandas Series and expanded into columns on both Python 2 and
    Python 3.

    Parameters
    ----------
    x : str
        Path of an image file that PIL can open.

    Returns
    -------
    list of float
        ``[R, G, B]`` channel means, each rounded to 3 decimals.
    """
    with Image.open(x) as im:
        # Take the statistics while the file is still open; convert() hands
        # back an RGB copy regardless of the mode stored in the file.
        channel_means = ImageStat.Stat(im.convert('RGB')).mean
    return [round(value, 3) for value in channel_means]
def as_dataframe(avgs):
    """Expand a Series of per-image channel means into a frame with columns R, G and B.

    Each element of ``avgs`` supplies one row; the Series index is kept as the
    row index of the result.
    """
    channel_columns = ['R', 'G', 'B']
    rows = avgs.values.tolist()
    return pd.DataFrame(rows, index=avgs.index, columns=channel_columns)
    
# Measure the mean colour of every image and attach the R, G, B columns by row index.
rgb_means = as_dataframe(df_4['Artwork_Image_Path'].map(rgb_avg))
df_5 = df_4.join(rgb_means)
df_5.head()
Out[8]:
Artist_Name Auction_Year Lot_ID Artwork_Image Lot_No Status Lot_Estimate_Low_USD Lot_Estimate_High_USD SalePrice_USD Auction_Title ... YearOfAuction CPI AdjustedSalePrice Price Old_Index Index Artwork_Image_Path R G B
4 A A Raiba 2016 6449966 http://artinfo-images-350.s3.amazonaws.com/asi... 17.0 Sold 10500.0 13500.0 25200 Evening Sale ... 2016 241.432 26103.593558 Low 0.00454 0.000392 images2/asi2-125006/17.jpg 97.616 83.346 69.424
5 A A Raiba 2017 6645121 http://artinfo-images-350.s3.amazonaws.com/asi... 26.0 Sold 22350.0 29800.0 23244 Evening Sale ... 2017 246.524 23580.133034 Low 0.00454 0.000392 images2/asi2-129104/26.jpg 77.174 69.588 61.304
8 A A Raiba 2008 3627464 http://artinfo-images-350.s3.amazonaws.com/mis... 63.0 Sold 5000.0 7000.0 6250 Modern & Contemporary Indian Art ... 2008 210.228 7435.052657 Low 0.00454 0.000392 images2/missingImages/0887180/63.jpg 142.482 130.763 114.555
10 A A Raiba 2017 6645120 http://artinfo-images-350.s3.amazonaws.com/asi... 25.0 Sold 7450.0 10430.0 13410 Evening Sale ... 2017 246.524 13603.922904 Low 0.00454 0.000392 images2/asi2-129104/25.jpg 163.368 127.609 71.317
12 A A Raiba 2015 5996406 http://artinfo-images-350.s3.amazonaws.com/asi... 58.0 Sold 3052.0 4577.0 11443 The Fine Art Sale - Including Works from the C... ... 2015 236.525 12099.221761 Low 0.00454 0.000392 images2/asi2-120614/58.jpg 114.307 113.882 102.117

5 rows × 39 columns

In [9]:
# Feature matrix for clustering: one row per artwork, columns are the mean R, G, B values.
X = np.array(df_5.loc[:, ['R', 'G', 'B']])
In [10]:
# Group the artworks into 8 clusters by mean colour; random_state is fixed so the fit is repeatable.
kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(X)
kmeans
Out[10]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)
In [11]:
# Number of clusters the model was configured with.
kmeans.n_clusters
Out[11]:
8
In [12]:
# Full set of estimator parameters, including the ones left at their defaults.
kmeans.get_params()
Out[12]:
{'algorithm': 'auto',
 'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 8,
 'n_init': 10,
 'n_jobs': None,
 'precompute_distances': 'auto',
 'random_state': 0,
 'tol': 0.0001,
 'verbose': 0}
In [13]:
# Table of cluster labels and how many artworks fell into each of them.
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
pd.DataFrame({'count': cluster_sizes}, index=pd.Index(cluster_ids, name='label'))
Out[13]:
count
label
0 941
1 942
2 1248
3 1284
4 709
5 1479
6 428
7 866
In [14]:
# Attach each row's cluster label as a new column, leaving df_5 untouched.
df_6 = df_5.copy()
df_6['kmean_cluster'] = kmeans.labels_
In [15]:
# Save the clustered table; index=False leaves the row index out of the file.
df_6.to_csv("df_kmean.csv",index=False)
In [16]:
# Pick up to n_imgs random images for each cluster label.
n_imgs = 8
samples = {
    label: group[['R','G','B',
                  'Artwork_Image_Path',
                  'kmean_cluster']].sample(
        min(n_imgs, len(group)),  # sample() raises when asked for more rows than the cluster holds
        random_state=0)           # fixed seed, so a re-run shows the same images
    for label, group in df_6.groupby('kmean_cluster')
}
In [17]:
def render_images(data, label):
    """Show the sampled images of one cluster in a two-row grid.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per image; the columns ``R``, ``G``, ``B``,
        ``Artwork_Image_Path`` and ``kmean_cluster`` are read.
    label
        Cluster label; used only in the figure title.
    """
    grid_rows = 2
    n_images = data.shape[0]
    # Ceiling integer division: the column count stays an int on Python 3
    # (plt.subplots and range() reject floats) and an odd number of images
    # still gets a slot for the last one.
    grid_cols = max(1, (n_images + grid_rows - 1) // grid_rows)
    # squeeze=False keeps axarr two-dimensional even when there is one column.
    f, axarr = plt.subplots(grid_rows, grid_cols, figsize=(15,10), squeeze=False)
    # axarr.flat walks the grid row by row, the same order the images are in.
    for i, ax in enumerate(axarr.flat):
        ax.axis('off')
        if i >= n_images:
            continue  # spare grid cell: leave it blank
        img = data.iloc[i]
        with Image.open(img.Artwork_Image_Path) as x:
            ax.imshow(x)
        # int() truncates the channel means for a compact title and works for
        # both numpy and plain Python floats.
        ax.set_title("RGB[{},{},{}] {}\n{}".format(
            int(img.R),
            int(img.G),
            int(img.B),
            img.kmean_cluster,
            img.Artwork_Image_Path.replace(imgdir+"/",'')))
    _ = f.suptitle("label={}".format(label))
In [18]:
# Render one sample grid per cluster; looping over the sampled labels avoids a
# hand-written call per label and keeps working if the number of clusters changes.
for label in sorted(samples):
    render_images(samples[label], label)